#/******************************************************************************
#*
#* Freescale Semiconductor Inc.
#* (c) Copyright 2004-2006 Freescale Semiconductor, Inc.
#* ALL RIGHTS RESERVED.
#*
#*******************************************************************************
#*
#* $File Name:  fft_radix4_frac32.s$
#*
#* $Date:       Jun-21-2006$
#*
#* $Version:    1.0.1.0$
#*
#* Description: N-point radix4 complex to complex frac16/frac32 in-place FFT,
#*              data ordering: real, imag, real, imag, ...,
#*              input data 16-bit fractional stored in second half of inout_buffer,
#*              output data 32-bit fractional,
#*              there must be 4*ONE_ITEM_BYTESIZE bytes of readable memory
#*              behind inout_buffer
#*
#*
#*   void fft_radix4_frac32(unsigned int N, int *inout_buffer, int *twiddle_factor_table);
#*       N - FFT length (must be an even power of two, i.e. a power of four, e.g. 64, 256, 1024, 4096)
#*
#*******************************************************************************/

    .text
    .align  5
    .globl fft_radix4_frac32
#/* entry point; the register alias tables below name r3 = N, r4 = y (inout_buffer), r5 = w (twiddle table) */
fft_radix4_frac32:

#/* configurable constants */
DATA_TYPE:    .set                2    #/* 0 - float/float/float, 1 - frac32/frac32/frac32, 2 - frac16/frac32/frac32, 3 - frac16/frac32/frac16, note: in/out/w_table */
SCALING:    .set                  1    #/* 0 - off, 1 - on */
#/* end of configurable constants */

#/* constants */
#/* everything below is derived from DATA_TYPE / SCALING and is not meant to be edited directly */
  .if DATA_TYPE == 0
SCALING:    .set                  0    #/* no scaling if float version */
  .endif
  .if DATA_TYPE == 3
W_TABLE_SIZE:    .set             1    #/* 1 - 16bit */
  .else
W_TABLE_SIZE:    .set             0    #/* 0 - 32bit */
  .endif
STAGE_SCALE_FACTOR:    .set            2    #/* scale_factor = 2^-STAGE_SCALE_FACTOR */
ONE_ITEM_BYTESIZE:    .set             8    #/* bytes per complex item (re, im) in the working/output buffer */
ONE_ITEM_BYTESIZE_EXP:    .set         3    #/* 2^ONE_ITEM_BYTESIZE_EXP = ONE_ITEM_BYTESIZE */
ADDR_ADJ:    .set                      (4*ONE_ITEM_BYTESIZE)    #/* byte size of one group of four output items */
  .if DATA_TYPE >= 2
#/* 16-bit input: one complex item read by the first stage occupies 4 bytes */
ONE_ITEM_BYTESIZE_1ST:    .set         4
ONE_ITEM_BYTESIZE_1ST_EXP:    .set     2
  .else
#/* 32-bit input: one complex item read by the first stage occupies 8 bytes */
ONE_ITEM_BYTESIZE_1ST:    .set         8
ONE_ITEM_BYTESIZE_1ST_EXP:    .set     3
  .endif
ADDR_ADJ_1ST:    .set                  (4*ONE_ITEM_BYTESIZE_1ST)    #/* byte size of one group of four first-stage input items */
  .if W_TABLE_SIZE == 1
W_TABLE_1ITEM_SIZE:    .set            4
W_TABLE_1ITEM_SIZE_EXP:    .set        2    #/* 2^W_TABLE_1ITEM_SIZE_EXP = W_TABLE_1ITEM_SIZE */
  .else
W_TABLE_1ITEM_SIZE:    .set            8
W_TABLE_1ITEM_SIZE_EXP:    .set        3    #/* 2^W_TABLE_1ITEM_SIZE_EXP = W_TABLE_1ITEM_SIZE */
  .endif
#/* end of constants */

#/* macros */
#/* The macros below hide the difference between the float build (DATA_TYPE == 0) */
#/* and the fractional builds, so the butterfly code is written only once.        */

#/* evmadd a,b,c : a = b + c on both 32-bit elements (float add or integer word add) */
evmadd: .macro a,b,c
            .if     DATA_TYPE == 0
                evfsadd  a,b,c
            .else
                evaddw   a,b,c
            .endif
        .endm
#/* evmsub a,b,c : a = b - c on both 32-bit elements; the operands of evsubfw are */
#/* passed swapped because that instruction subtracts its first source from its second */
evmsub: .macro a,b,c
            .if     DATA_TYPE == 0
                evfssub  a,b,c
            .else
                evsubfw  a,c,b
            .endif
        .endm
#/* evmmul a,b,c : a = b * c on both elements (float multiply, or evmwhssf for the fractional builds) */
evmmul: .macro a,b,c
            .if     DATA_TYPE == 0
                evfsmul  a,b,c
            .else
                evmwhssf a,b,c
            .endif
        .endm
#/* evscale a,b : in-place arithmetic right shift of both elements of a by b bits; */
#/* expands to nothing when SCALING == 0 */
evscale: .macro a,b
            .if     SCALING == 0
            .else
                evsrwis a,a,b
            .endif
        .endm
#/* evld_w a,b : load one twiddle factor from address operand b into a; */
#/* evldw for the 32-bit table (W_TABLE_SIZE == 0), evlwhe for the 16-bit table */
evld_w: .macro a,b
            .if  W_TABLE_SIZE == 0
                evldw  a,b
            .else
                evlwhe a,b
            .endif
        .endm
#/* evld_in a,b : load one first-stage input item from address operand b into a; */
#/* evldw for 8-byte items, evlwhe for 4-byte (16-bit re/im) items */
evld_in: .macro a,b
            .if  ONE_ITEM_BYTESIZE_1ST == 8
                evldw  a,b
            .else
                evlwhe a,b
            .endif
        .endm

#/* code start */
    #/* store nonvolatile registers */
    #/* a 160-byte frame is allocated; r14-r31 are saved with 64-bit stores (evstdd) */
    #/* in 8-byte slots at offsets 16..152 because the code below uses both halves   */
    #/* of these registers */
    stwu        r1, -160(r1);
    evstdd      r31, 152(r1);
    evstdd      r30, 144(r1);
    evstdd      r29, 136(r1);
    evstdd      r28, 128(r1);
    evstdd      r27, 120(r1);
    evstdd      r26, 112(r1);
    evstdd      r25, 104(r1);
    evstdd      r24, 96(r1);
    evstdd      r23, 88(r1);
    evstdd      r22, 80(r1);
    evstdd      r21, 72(r1);
    evstdd      r20, 64(r1);
    evstdd      r19, 56(r1);
    evstdd      r18, 48(r1);
    evstdd      r17, 40(r1);
    evstdd      r16, 32(r1);
    evstdd      r15, 24(r1);
    evstdd      r14, 16(r1);

#<#/* registers for first radix-4 stage */
#/* Symbolic names for the register numbers used by the first stage.            */
#/* NOTE(review): the instructions below spell registers literally (r7, r22, ...) */
#/* so these symbols appear to serve as a map of the register allocation only.  */
#/* Several names share one number where a register is reused for a later value. */
N:    .set     3;
y:    .set     4;
w:    .set     5;
out_ptr:    .set     6;
out_ptr1:    .set     22;

#/* first group of four items: inputs x0..x3 */
x0:    .set     7;
x1:    .set     8;
x2:    .set     9;
x3:    .set     10;

#/* x02 = x0 + x2, x13 = x1 + x3; x02m / x13m are the differences (reuse x0 / x3) */
x02:    .set     11;
x13:    .set     12;
x02m:    .set     x0;
x13m:    .set     x3;

#/* outputs y0..y3 of the first group */
y0:    .set     x2;
y1:    .set     x02;
y2:    .set     x02;
y3:    .set     14;

y13:    .set     x13;
y31:    .set     x3;

#/* x13m with its two 32-bit halves swapped (evmergelohi in the code) */
flip_x13m:    .set      x3;


#/* second group of four items ("a" suffix): same roles in a separate register set */
x0a:    .set     15;
x1a:    .set     16;
x2a:    .set     17;
x3a:    .set     18;

x02a:    .set     19;
x13a:    .set     20;
x02ma:    .set     x0a;
x13ma:    .set     x3a;

y0a:    .set     x2a;
y1a:    .set     x02a;
y2a:    .set     x02a;
y3a:    .set     21;

y13a:    .set     x13a;
y31a:    .set     x3a;

flip_x13ma:    .set     x3a;
#>#

#/* code for first radix-4 stage */
#/* Twiddle-free radix-4 butterflies over groups of four consecutive complex items: */
#/*   y0 = (x0+x2) + (x1+x3)        y2 = (x0+x2) - (x1+x3)                          */
#/*   y1 = (x0-x2) - j*(x1-x3)      y3 = (x0-x2) + j*(x1-x3)                        */
#/* Items are read through r22 (stride ONE_ITEM_BYTESIZE_1ST) and written through   */
#/* r6 (stride ONE_ITEM_BYTESIZE). Every input is scaled by 2^-STAGE_SCALE_FACTOR   */
#/* when SCALING is on. One loop pass handles two groups (register sets r7-r12,r14  */
#/* and r15-r21) and preloads the inputs of the following group, hence ctr = N/8.   */
  .if DATA_TYPE >= 2
     slwi         r0,r3,ONE_ITEM_BYTESIZE_EXP-1;
     add          r22,r4,r0;        #/* out_ptr1 = y_load  = y + N/2 * ONE_ITEM_BYTESIZE; */
  .else
out_ptr1:    .set     y
     #/* the loads below address the input through r22 literally, so r22 has to    */
     #/* point at the buffer start when input and output share the same item size  */
     mr           r22,r4;           #/* out_ptr1 = y_load  = y; */
  .endif
     #/* preload and scale x0..x3 of the first group, form x02 and x13 */
     evld_in      r7, 0(r22);
     mr           r6,r4;            #/* out_ptr = y_store = y; */
     evld_in      r9, 2*ONE_ITEM_BYTESIZE_1ST(r22);
     evscale      r7, STAGE_SCALE_FACTOR;
     srwi         r0,r3,3;                          #/* ctr = N/4/2 */
     evld_in      r8, ONE_ITEM_BYTESIZE_1ST(r22);
     evscale      r9, STAGE_SCALE_FACTOR;
  .if DATA_TYPE == 0
     nop          #/* nop used for code alignment */
  .endif
     evld_in      r10, 3*ONE_ITEM_BYTESIZE_1ST(r22);
     evscale      r8, STAGE_SCALE_FACTOR;
     mtctr        r0;
     evscale      r10, STAGE_SCALE_FACTOR;
     evmadd       r11,  r7, r9;
     evmadd       r12,  r8, r10;
initial_loop:
  .if DATA_TYPE >= 2
  .else
out_ptr1:    .set     out_ptr
  .endif
     #/* there must be 4*ONE_ITEM_BYTESIZE bytes of readable memory behind input buffer */
     #/* group 0 butterfly (r7-r12,r14), interleaved with the loads of group 1 (r15-r18) */
     evmsub       r7, r7, r9;
     evld_in      r18, ADDR_ADJ_1ST+3*ONE_ITEM_BYTESIZE_1ST(r22);
     evmsub       r10, r8, r10;
     evmadd       r9,  r11, r12;
     evscale      r18, STAGE_SCALE_FACTOR;
     evld_in      r15, ADDR_ADJ_1ST+0(r22);
     evmsub       r11,  r11, r12;
     evld_in      r17, ADDR_ADJ_1ST+2*ONE_ITEM_BYTESIZE_1ST(r22);
     evscale      r15, STAGE_SCALE_FACTOR;
     evmergelohi  r10, r10, r10;     #/* swap the halves of x13m (multiplication by j up to sign) */
     evscale      r17, STAGE_SCALE_FACTOR;
     evld_in      r16, ADDR_ADJ_1ST+ONE_ITEM_BYTESIZE_1ST(r22);
     evmadd       r12,  r7, r10;
     evmsub       r10,  r7, r10;
     evscale      r16, STAGE_SCALE_FACTOR;

     evstdw       r9, 0(r6);
     evmadd       r19,  r15, r17;
     evstdw       r11, 2*ONE_ITEM_BYTESIZE(r6);

     #/* y1 and y3 are assembled from opposite halves of the sum and the difference */
     evmergehilo  r11, r12, r10;
     evstdw       r11, ONE_ITEM_BYTESIZE(r6);
     evmergehilo  r14, r10, r12;
     evstdw       r14, 3*ONE_ITEM_BYTESIZE(r6);
     evmadd       r20,  r16, r18;


     #/* group 1 butterfly (r15-r21), interleaved with the loads of the next group 0 (r7-r10) */
     evmsub       r15, r15, r17;
     evld_in      r10, 2*4*ONE_ITEM_BYTESIZE_1ST+3*ONE_ITEM_BYTESIZE_1ST(r22);
     evmsub       r18, r16, r18;
     evmadd       r17,  r19, r20;
     evscale      r10, STAGE_SCALE_FACTOR;
     evld_in      r7, 2*4*ONE_ITEM_BYTESIZE_1ST+0(r22);
     evmsub       r19,  r19, r20;
     evld_in      r9, 2*4*ONE_ITEM_BYTESIZE_1ST+2*ONE_ITEM_BYTESIZE_1ST(r22);
     evscale      r7, STAGE_SCALE_FACTOR;
     evmergelohi  r18, r18, r18;
     evscale      r9, STAGE_SCALE_FACTOR;
     evld_in      r8, 2*4*ONE_ITEM_BYTESIZE_1ST+ONE_ITEM_BYTESIZE_1ST(r22);
     evmadd       r20,  r15, r18;
     evmsub       r18,  r15, r18;
     evscale      r8, STAGE_SCALE_FACTOR;

     evstdw       r17, ADDR_ADJ+0(r6);
     evmadd       r11,  r7, r9;
     evstdw       r19, ADDR_ADJ+2*ONE_ITEM_BYTESIZE(r6);

     evmergehilo  r21, r18, r20;
     evstdw       r21, ADDR_ADJ+3*ONE_ITEM_BYTESIZE(r6);
     evmergehilo  r19, r20, r18;
     evstdw       r19, ADDR_ADJ+ONE_ITEM_BYTESIZE(r6);
     evmadd       r12,  r8, r10;


     #/* advance the store and load pointers by two groups (eight items); the load */
     #/* pointer is advanced for every DATA_TYPE because all loads go through r22  */
     addi         r6, r6, 2*4*ONE_ITEM_BYTESIZE;
     addi         r22, r22, 2*4*ONE_ITEM_BYTESIZE_1ST;
     bdnz         initial_loop;


#<#/* registers for next radix-4 stages */
#/* Symbolic names for the register numbers used by the remaining stages.         */
#/* NOTE(review): the instructions below spell registers literally, so these      */
#/* symbols appear to serve as a map of the register allocation only.             */
#/* Czech identifiers: pocet = count, sek(ce) = section, mot(ylek) = butterfly,   */
#/* stupen = stage.                                                               */
N:    .set     3
y:    .set     4
w:    .set     5
wy:    .set     N

#/* running twiddle-table pointers for the three twiddled inputs */
w1adr:    .set     29
w2adr:    .set     30
w3adr:    .set     31

#/* data pointers of the four butterfly inputs */
radr:    .set      6
r1adr:    .set      7
r2adr:    .set      8
r3adr:    .set      9

#/* byte steps applied to w1adr / w2adr / w3adr */
w1add_inc:    .set     10
w2add_inc:    .set     11
w3add_inc:    .set     12

pocet_motx8:    .set     14
section_loop_ctr:    .set     28
tmpx8:    .set      0

#/* registers for radix-4 btrfly */
#/* 15 - 27, 5, 4, 14 available */
#/* inputs 1 and 5 with their twiddles, split into re / im pairs */
x1:    .set     14
x5:    .set     16
w1:    .set     17
w5:    .set     18
x15_re:    .set     22
x15_im:    .set     x1
w15_re:    .set     19
w15_im:    .set     w5

#/* the four partial products of one complex multiplication */
tmp0:    .set     20
tmp1:    .set     21
tmp2:    .set     w15_im
tmp3:    .set     w15_re


#/* inputs 2 and 6 with their twiddles */
x2:    .set     15
x6:    .set     16
w2:    .set     17
w6:    .set     18
w26_re:    .set     19
w26_im:    .set     w6


#/* inputs 3 and 7 with their twiddles */
x3:    .set     15
x7:    .set     16
w3:    .set     17
w7:    .set     18
w37_re:    .set     19
w37_im:    .set     w7

#/* inputs 0 and 4 (no twiddle) */
x0:    .set     15
x4:    .set     16

x37_re:    .set     23
x37_im:    .set     24
x26_re:    .set     25
x26_im:    .set     26
x04_re:    .set     17
x04_im:    .set     x0



#/* registers holding the twiddled inputs when the butterfly starts */
x15_re_n:    .set     22
x15_im_n:    .set     27
x37_re_n:    .set     20
x37_im_n:    .set     21
x26_re_n:    .set     25
x26_im_n:    .set     26
x04_re_n:    .set     17
x04_im_n:    .set     x0


#/* butterfly sums */
x0246_re:    .set     16
x1357_re:    .set      4
x0246_im:    .set     18
x1357_im:    .set     19

y04_re:    .set     23
y26_re:    .set     x0246_re
y04_im:    .set     24
y26_im:    .set     x0246_im

y0:    .set      5
y4:    .set     y04_re
y2:    .set     14
y6:    .set     5


#/* butterfly differences ("_m") */
x0246_re_m:    .set     16
x1357_re_m:    .set      4
x0246_im_m:    .set     18
x1357_im_m:    .set     19

y15_re:    .set     23
y37_re:    .set     x0246_re_m
y15_im:    .set     24
y37_im:    .set     x0246_im_m

y1:    .set     17
y3:    .set     20
y5:    .set     y15_re
y7:    .set     y37_re
#>#

#/* code for next radix-4 stages */
#/* Loop state is packed into the upper 32-bit halves of 64-bit registers whose  */
#/* lower halves are used as pointers/scratch: evmergelo rD,rA,rB places the low */
#/* word of rA into the high half of rD and keeps the low word of rB.            */
    li         r0,1;                     #/* section_loop_ctr_init_value = 1 */
    evmergelo  r8,r0,r8;           #/* r2adr = (section_loop_ctr_init_value, r2adr) */

    srwi       r0,r3,4;                   #/* pocet_sek (number of sections) = N/16 */
    evmergelo  r6,r0,r6;             #/* radr = (pocet_sek, radr) */

    cntlzw     r0,r3;
    subfic     r0,r0,31;                 #/* 2^n=N, n=31-num_of_leading_zeros */
    srwi       r0,r0,1;
    evmergelo  r7,r0,r7;           #/* r1adr = (n/2, r1adr) */

    evmergelo  r3, r5, r4;                 #/* pack w and y to one register */

    li         r14,ONE_ITEM_BYTESIZE*4;    #/* pocet_motx8 (butterflies per section, in bytes) */
    evmergelo  r0,r14,r0;  #/* tmpx8 = pocet_motx8, tmpx8 */

    #//li         r15,ONE_ITEM_BYTESIZE*4*4;       #/* index*8 */
    #//evmergelo  w2adr,r15,r15;          #/* w2adr = index*8 */
    evslwi     r30,r0,2;                 #/* high half of w2adr = index*8 = 4 * pocet_motx8 */

    #//li         r15,2;                     #/* stage_loop_ctr */
    #//evmergelo  w1adr,r15,r15;             #/* w1adr = (stage_loop_ctr, w1adr) */
    evsrwiu    r29,r0,ONE_ITEM_BYTESIZE_EXP+1;    #/* high half of w1adr = stage loop counter, starts at 2 */

#/* Stage loop ("stupen" = stage): one pass per remaining radix-4 stage.          */
#/* Loop state lives in the high halves of r6 (pocet_sek = number of sections),   */
#/* r7 (n/2), r8 (counter init value 1), r0 (pocet_motx8 = butterflies per        */
#/* section in bytes), r29 (stage counter) and r30 (index*8 = section byte size). */
#/* Twiddle factors are fetched with the evld_w macro so that the load width      */
#/* follows W_TABLE_SIZE, matching the W_TABLE_1ITEM_SIZE_EXP stride used below.  */
stupen_loop:
    evmergehi  r5,r6,r6;                   #/* r5 = pocet_sek */
    li         r0,0;
    evor       r28,r8, r8;         #/* section_loop_ctr (high portion) = 1 */
    slwi       r10,r5,W_TABLE_1ITEM_SIZE_EXP;    #/* w1 step = pocet_sek * W_TABLE_1ITEM_SIZE */
    add        r11,r10,r10;  #/* w2 step = 2 * w1 step */
    add        r12,r11,r10;  #/* w3 step = 3 * w1 step */
#/* Section loop: one pass per section of the current stage */
section_loop:
    #/* restart the three twiddle pointers at w (kept in the high half of r3); */
    #/* the high halves of r29 / r30 keep their loop state */
    evmergehi  r29, r29, r3;
    evmergehi  r30, r30, r3;
    evmergehi  r31, r31, r3;
    add        r6,r3,r0;            #/* radr  = y + tmpx8 */
    evmergehi  r14,r0,r0;  #/* extract pocet_motx8 from tmpx8 */
    add        r7,r6,r14;   #/* r1adr = y + (tmpx8 +   pocet_motx8) */
    add        r8,r7,r14;  #/* r2adr = y + (tmpx8 + 2*pocet_motx8) */
    evldw      r16, ONE_ITEM_BYTESIZE+0(r7);
    add        r9,r8,r14;  #/* r3adr = y + (tmpx8 + 3*pocet_motx8) */
    srwi       r5,r14,ONE_ITEM_BYTESIZE_EXP+2;    #/* butterfly_loop_ctr = pocet_mot/4 */
    evldw      r14, 0(r7);
    evscale    r16, STAGE_SCALE_FACTOR;
    mtctr      r5;
    evscale    r14, STAGE_SCALE_FACTOR;
#/* Butterfly loop ("motylek" = butterfly): each pass computes two pairs of       */
#/* radix-4 butterflies, i.e. four consecutive items of each of the four inputs.  */
#/* The first two items of r1adr are already loaded and scaled in r14 / r16.      */
motylek_loop:
#/* first pair of radix-4 butterflies */
     #/* multiplication with twiddle factors */
     #/* 1,5 */
     evld_w       r17, 0(r29);
     add          r29,r29,r10;
     evld_w       r18, 0(r29);

     #/* regroup two complex items into a (re,re) register and an (im,im) register */
     evmergehi    r22, r14, r16;
     evmergelo    r14, r14, r16;
     evmergehi    r19, r17, r18;
     evmergelo    r18, r17, r18;

     #/* complex product: re = x_re*w_re - x_im*w_im, im = x_re*w_im + x_im*w_re */
     evmmul       r20, r22, r19;
     evmmul       r21, r14, r18;
     evmmul       r18, r22, r18;
     evmmul       r19, r14, r19;

     evldw        r15, 0(r8);
     evmsub       r22, r20, r21;
     evmadd       r27, r18, r19;
     evscale      r15, STAGE_SCALE_FACTOR;


     #/* 2,6 */
     evldw        r16, ONE_ITEM_BYTESIZE+0(r8);
     evld_w       r17, 0(r30);
     add          r30,r30,r11;
     evscale      r16, STAGE_SCALE_FACTOR;
     evld_w       r18, 0(r30);

     evmergehi    r25, r15, r16;
     evmergelo    r26, r15, r16;
     evmergehi    r19, r17, r18;
     evmergelo    r18, r17, r18;

     evmmul       r20, r25, r19;
     evmmul       r21, r26, r18;
     evmmul       r18, r25, r18;
     evmmul       r19, r26, r19;

     evldw        r15, 0(r9);
     evmsub       r25, r20, r21;
     evmadd       r26, r18, r19;
     evscale      r15, STAGE_SCALE_FACTOR;


     #/* 3,7 */
     evldw        r16, ONE_ITEM_BYTESIZE+0(r9);
     evld_w       r17, 0(r31);
     add          r31,r31,r12;
     evscale      r16, STAGE_SCALE_FACTOR;
     evld_w       r18, 0(r31);

     evmergehi    r23, r15, r16;
     evmergelo    r24, r15, r16;
     evmergehi    r19, r17, r18;
     evmergelo    r18, r17, r18;

     evmmul       r20, r23, r19;
     evmmul       r21, r24, r18;
     evmmul       r18, r23, r18;
     evmmul       r19, r24, r19;

     evldw        r15, 0(r6);
     evldw        r16, ONE_ITEM_BYTESIZE+0(r6);
     evmsub       r20, r20, r21;
     evscale      r15, STAGE_SCALE_FACTOR;
     evscale      r16, STAGE_SCALE_FACTOR;
     evmadd       r21, r18, r19;


     #/* 0,4 */
     evmergehi    r17, r15, r16;
     evmergelo    r15, r15, r16;


     #/* radix-4 butterfly */
     #/* sums: outputs written back to radr and r2adr */
     evmadd       r16, r17, r25;
     evmadd       r4, r22, r20;
     evmadd       r18, r15, r26;
     evmadd       r19, r27, r21;

     evmadd       r23, r16, r4;
     evmsub       r16, r16, r4;
     evmadd       r24, r18, r19;
     evmsub       r18, r18, r19;


     add          r29,r29,r10;
     #/* regroup (re,re)/(im,im) back into (re,im) items before storing */
     evmergehi    r5, r23, r24;
     evmergelo    r23, r23, r24;
     evmergehi    r14, r16, r18;
     evstdw       r5, 0(r6);
     evmergelo    r5, r16, r18;

     evstdw       r23, ONE_ITEM_BYTESIZE+0(r6);
     evstdw       r14, 0(r8);
     evstdw       r5, ONE_ITEM_BYTESIZE+0(r8);


     #/* differences: outputs written back to r1adr and r3adr */
     evmsub       r16, r17, r25;
     evmsub       r19, r27, r21;
     evmsub       r18, r15, r26;
     evmsub       r4, r22, r20;

     evmadd       r23, r16, r19;
     evmsub       r16, r16, r19;
     evmsub       r24, r18, r4;
     evmadd       r18, r18, r4;

     #/* preload the next r1adr item for the second pair */
     evldw        r14, 2*ONE_ITEM_BYTESIZE+0(r7);
     evmergehi    r17, r23, r24;
     evmergelo    r23, r23, r24;
     evscale      r14, STAGE_SCALE_FACTOR;
     evmergehi    r20, r16, r18;
     evstdw       r17, 0(r7);
     evmergelo    r16, r16, r18;

     evstdw       r20, 0(r9);
     add          r31,r31,r12;
     evstdw       r16, ONE_ITEM_BYTESIZE+0(r9);


#//    addi     radr,radr,2*ONE_ITEM_BYTESIZE;
#//    addi     r1adr,r1adr,2*ONE_ITEM_BYTESIZE;
#//    addi     r2adr,r2adr,2*ONE_ITEM_BYTESIZE;
#//    addi     r3adr,r3adr,2*ONE_ITEM_BYTESIZE;


#/* second pair of radix-4 butterflies */
#/* same computation on the next two items (data offsets +2*ONE_ITEM_BYTESIZE) */
     #/* multiplication with twiddle factors */
     #/* 1,5 */
     add          r30,r30,r11;
     evldw        r16, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r7);
     evld_w       r17, 0(r29);
     add          r29,r29,r10;
     evscale      r16, STAGE_SCALE_FACTOR;
     evld_w       r18, 0(r29);

     evmergehi    r22, r14, r16;
     evmergelo    r14, r14, r16;
     #/* delayed store of the first pair's second r1adr output */
     evstdw       r23, ONE_ITEM_BYTESIZE+0(r7);
     evmergehi    r19, r17, r18;
     evmergelo    r18, r17, r18;

     evmmul       r20, r22, r19;
     evmmul       r21, r14, r18;
     evmmul       r18, r22, r18;
     evmmul       r19, r14, r19;

     evldw        r15, 2*ONE_ITEM_BYTESIZE+0(r8);
     evmsub       r22, r20, r21;
     evmadd       r27, r18, r19;
     evscale      r15, STAGE_SCALE_FACTOR;


     #/* 2,6 */
     evldw        r16, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r8);
     evld_w       r17, 0(r30);
     add          r30,r30,r11;
     evscale      r16, STAGE_SCALE_FACTOR;
     evld_w       r18, 0(r30);

     evmergehi    r25, r15, r16;
     evmergelo    r26, r15, r16;
     evmergehi    r19, r17, r18;
     evmergelo    r18, r17, r18;

     evmmul       r20, r25, r19;
     evmmul       r21, r26, r18;
     evmmul       r18, r25, r18;
     evmmul       r19, r26, r19;

     evldw        r15, 2*ONE_ITEM_BYTESIZE+0(r9);
     evmsub       r25, r20, r21;
     evmadd       r26, r18, r19;
     evscale      r15, STAGE_SCALE_FACTOR;


     #/* 3,7 */
     evldw        r16, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r9);
     evld_w       r17, 0(r31);
     add          r31,r31,r12;
     evscale      r16, STAGE_SCALE_FACTOR;
     evld_w       r18, 0(r31);

     evmergehi    r23, r15, r16;
     evmergelo    r24, r15, r16;
     evmergehi    r19, r17, r18;
     evmergelo    r18, r17, r18;

     evmmul       r20, r23, r19;
     evmmul       r21, r24, r18;
     evmmul       r18, r23, r18;
     evmmul       r19, r24, r19;

     evldw        r15, 2*ONE_ITEM_BYTESIZE+0(r6);
     evldw        r16, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r6);
     evmsub       r20, r20, r21;
     evscale      r15, STAGE_SCALE_FACTOR;
     evscale      r16, STAGE_SCALE_FACTOR;
     evmadd       r21, r18, r19;


     #/* 0,4 */
     evmergehi    r17, r15, r16;
     evmergelo    r15, r15, r16;


     #/* radix-4 butterfly */
     evmadd       r16, r17, r25;
     evmadd       r4, r22, r20;
     evmadd       r18, r15, r26;
     evmadd       r19, r27, r21;

     evmadd       r23, r16, r4;
     evmsub       r16, r16, r4;
     evmadd       r24, r18, r19;
     evmsub       r18, r18, r19;


     add          r29,r29,r10;
     evmergehi    r5, r23, r24;
     evmergelo    r23, r23, r24;
     evmergehi    r14, r16, r18;
     evstdw       r5, 2*ONE_ITEM_BYTESIZE+0(r6);
     evmergelo    r5, r16, r18;

     evstdw       r23, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r6);
     evstdw       r14, 2*ONE_ITEM_BYTESIZE+0(r8);
     evstdw       r5, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r8);



     evmsub       r16, r17, r25;
     evmsub       r19, r27, r21;
     evmsub       r18, r15, r26;
     evmsub       r4, r22, r20;

     evmadd       r23, r16, r19;
     evmsub       r16, r16, r19;
     evmsub       r24, r18, r4;
     evmadd       r18, r18, r4;

     #/* preload the first r1adr item of the next loop pass */
     evldw        r14, 4*ONE_ITEM_BYTESIZE+0(r7);
     evmergehi    r17, r23, r24;
     evmergelo    r23, r23, r24;
     evscale      r14, STAGE_SCALE_FACTOR;
     evmergehi    r20, r16, r18;
     evstdw       r17, 2*ONE_ITEM_BYTESIZE+0(r7);
     evmergelo    r16, r16, r18;

     evstdw       r20, 2*ONE_ITEM_BYTESIZE+0(r9);
     add          r30,r30,r11;
     evstdw       r23, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r7);
     add          r31,r31,r12;
     evstdw       r16, 2*ONE_ITEM_BYTESIZE+ONE_ITEM_BYTESIZE+0(r9);


    #/* advance all four data pointers by four items and preload the second */
    #/* r1adr item of the next pass */
    addi     r7,r7,4*ONE_ITEM_BYTESIZE;
    evldw    r16, ONE_ITEM_BYTESIZE+0(r7);
    addi     r8,r8,4*ONE_ITEM_BYTESIZE;
    addi     r9,r9,4*ONE_ITEM_BYTESIZE;
    evscale  r16, STAGE_SCALE_FACTOR;
    addi     r6, r6, 4*ONE_ITEM_BYTESIZE;
    bdnz     motylek_loop;
    #/* end of motylek_loop */

    evmergehi  r5,r30,r30;                #/* r5  = index*8 */
    evaddiw    r28,r28,1;
    add        r0,r0,r5;                  #/* tmpx8 += index*8: byte offset of the next section */
    evcmpgtu   0,r28,r6;       #/* compare section_loop_ctr (in high portion of section_loop_ctr) greater than pocet_sek (in high portion of radr) */
    bc         4,0,section_loop;              #/* 0-bit in crf0, branch if condition is false */
    #/* end of section_loop */

    evsrwiu    r6,r6,2;              #/* pocet_sek = pocet_sek / 4 */
    evslwi     r0,r0,2;            #/* pocet_motx8 = pocet_motx8*4; tmpx8 low portion is zeroed at the stage beginning */
    evslwi     r30,r30,2;            #/* index*8 = index*8 * 4 */
    evaddiw    r29,r29,1;            #/* stage_loop_ctr = stage_loop_ctr + 1; */
    evcmpgtu   0,r29,r7;            #/* compare stage_loop_ctr (in high portion of w1adr) greater than n/2 (in high portion of r1adr) */
    bc         4,0,stupen_loop;          #/* 0-bit in crf0, branch if condition is false */
    #/* end of stupen_loop */

    #/* restore nonvolatile registers */
    #/* reload r14-r31 as full 64-bit values from the slots written at entry, */
    #/* then release the 160-byte frame and return */
    evldd      r31, 152(r1);
    evldd      r30, 144(r1);
    evldd      r29, 136(r1);
    evldd      r28, 128(r1);
    evldd      r27, 120(r1);
    evldd      r26, 112(r1);
    evldd      r25, 104(r1);
    evldd      r24, 96(r1);
    evldd      r23, 88(r1);
    evldd      r22, 80(r1);
    evldd      r21, 72(r1);
    evldd      r20, 64(r1);
    evldd      r19, 56(r1);
    evldd      r18, 48(r1);
    evldd      r17, 40(r1);
    evldd      r16, 32(r1);
    evldd      r15, 24(r1);
    evldd      r14, 16(r1);
    addi       r1, r1, 160;

    blr
